clear all
set more off

***********************************************************
//// 05-PROCESS_DRIVER: CONSTRUCT DRIVER INFORMATION ////
***********************************************************



***************************************************
***************************************************
//// STEP A: CONSTRUCT HISPANIC STATUS MEASURE ////
//// USING SURNAME FREQUENCY IN 2010 CENSUS    ////
***************************************************
***************************************************

**********************************************
/// PART 3A: READ IN RAW CENSUS NAMES DATA ///
**********************************************

// READ IN DATA //
insheet using "${raw}/census/Names_2010Census.csv", clear 

// REFORMATTING //
ren name Name
destring pcthispanic, gen(PctHisp) force

// KEEP NAMES THAT ARE 80+ PERCENT HISPANIC //
keep if PctHisp>=80 & !mi(PctHisp)

// KEEP NAMES ONLY + STORE TEMPORARY FILE //
keep Name
tempfile CENSUS 
save `CENSUS'


**********************************************************
/// PART 3B: GET LIST OF NAMES FROM TRAFFIC STOPS DATA ///
**********************************************************

// READ IN STOPS DATA //
use "${temp}/process-citation", clear
// KEEP FHP ONLY //
keep if IssueAgencyType==1

// KEEP UNIQUE DRIVER LAST NAMES //
keep DriverLast
duplicates drop
drop if mi(DriverLast)


*******************************************
/// PART 3C: PREPROCESSING CENSUS NAMES ///
*******************************************

// FORMATTING //
gen Last=upper(trim(Driver))
replace Last = stritrim(Last)
replace Last = subinstr(Last, "-", " ", .)
replace Last="SMITH" if Last=="SMITH                           BPO REST"
replace Last="CRUZ" if Last=="CRUZ                           BPO RESTR"
replace Last="EXILUS" if Last=="EXILUS                     BPO RESTRICTI"

// REMOVING JR AND SR //
gen Z=substr(Last,-2,.)
replace Last=substr(Last,1,length(Last)-1) if Z==" J"
replace Last=substr(Last,1,length(Last)-1) if Z==" S"
replace Last=trim(Last)
drop Z

// SPLIT INTO PIECES WHEN THERE ARE MULTIPLE NAMES LISTED //
split Last, parse(" ") gen(Name_)


********************************************************************
/// PART 3D: USE MERGE WITH TEMPFILE TO IDENTIFY HISPANIC STATUS ///
********************************************************************

forval i=1/7 {

rename Name_`i' Name
merge m:1 Name using `CENSUS', gen(Merge_`i')
gen Hisp_`i' = (Merge_`i'==3)
drop Merge_`i'
rename Name Name_`i'

}

// CODE AS HISPANIC IF ANY OF THE SUB-MERGES WAS SUCCESSFUL //
forval i=1/6 {
replace Hisp_`i' = 0 if mi(Hisp_`i')
}
egen AnyHisp=rowmax(Hisp_*)


//////// STORE DATASET ///////////
keep DriverLast AnyHisp
drop if mi(Driver)
ren AnyHisp CensusHisp
label var CensusHisp "Hispanic Surname"
saveold "${temp}/TEMP-HISPANIC", replace





***************************************************
***************************************************
//// STEP B: CONSTRUCT PROXY INCOME MEASURE    ////
//// USING ZIP-CODE PCI FROM IRS TAX RETURNS   ////
***************************************************
***************************************************

******************************************************
/// READ IN RAW DATA FILES  YEAR-BY-YEAR ///
/// FOR EACH YEAR, READ IN FILE, 
/// COMPUTE TOTAL AGI AND TOTAL RETURNS BY ZIP CODE,
/// STORE TEMPORARY FILE 
******************************************************


/// 2005-2007 ARE IN SAME FORMAT ///

local k=1
foreach x in 05 06 07 {
	
insheet using "${raw}/IRS/zipcode`x'.csv", clear
collapse (sum) n1 a00100, by(zipcode)
gen year=2000+`x'
ren n1 nReturns
ren a00100 Agi

tempfile TEMP`k'
save `TEMP`k''

local ++k

}


/// 2008 IN DIFFERENT FORMAT ///

insheet using "${raw}/IRS/08zpall.csv", clear
collapse (sum) n1 a00100, by(zipcode)
gen year=2008
ren n1 nReturns
ren a00100 Agi
tempfile TEMP`k'
save `TEMP`k''
local ++k



/// 2009-2014 IN SAME FORMAT ///

foreach x in 09 10 11 12 13 14 {

insheet using "${raw}/IRS/`x'zpallagi.csv", clear
collapse (sum) n1 a00100, by(zipcode)
gen year=2000+`x'
ren n1 nReturns
ren a00100 Agi
	
tempfile TEMP`k'
save `TEMP`k''
local ++k

}


/// 2014 IS LATEST YEAR ///
/// DUPLICATE 2014 FOR 2015 ///

insheet using "${raw}/IRS/14zpallagi.csv", clear
collapse (sum) n1 a00100, by(zipcode)
gen year=2015
ren n1 nReturns
ren a00100 Agi
	
tempfile TEMP`k'
save `TEMP`k''



*************************************
//// APPEND DATA, FORMAT, STORE ////
************************************

use `TEMP1', clear
forval i=2/`k' {
qui append using `TEMP`i''
}

ren zipcode Zip
gen ZipIncome = Agi/nReturns
ren year Year


//// NOTE: HAVE TO RESCALE 2007-2008 FOR CONTINUITY ACROSS YEARS ////
replace ZipIncome = ZipIncome/1000 if Year>=2007 & Year<=2008


/// STORE //
qui compress
saveold "${temp}/TEMP-IRS", replace




**************************************************
**************************************************
//// STEP C: CONSTRUCT PRIOR PRISON MEASURE  ////
//// USING DATA ON RELEASES FROM FDOC        ////
*************************************************
**************************************************


//// DATA ON PRISON RELEASES ////

import delimited using "${raw}/FDOC/INMATE_RELEASE_ROOT_Jan_2021.txt", clear varn(1)

gen last=trim(upper(lastname))
gen first=trim(upper(firstname))
gen l2=substr(last,1,2)
gen f2=substr(first,1,2)

gen BirthDate=date(birthdate,"MDY")
format %td BirthDate
gen releasedate=date(prisonreleasedate,"MDY")
format %td releasedate

keep l2 f2 BirthDate last first releasedate

qui compress
tempfile PRISON
save `PRISON'


//// MATCH TO DRIVER INFO IN CITATIONS FILE ///

use "${temp}/process-citation", clear

keep if DLstate=="FL"
keep CitationNumber CountyNum OffenseDate DriverLast DriverFirst BirthDate
replace DriverLast = trim(upper(DriverLast))
replace DriverFirst = trim(upper(DriverFirst))
gen l2 = substr(DriverLast,1,2)
gen f2 = substr(DriverFirst,1,2)

* CONSTRUCT ALL MATCHES ON DOB, FIRST TWO CHARS OF FIRST/LAST *
joinby BirthDate l2 f2 using `PRISON'

* DROP IF RELEASE IS AFTER THE TICKET DATE *
keep if OffenseDate>releasedate

* STRORE AS MATCH ONLY IF NAMES ARE SUFFICIENTLY CLOSE *
ustrdist DriverFirstName first, gen(fdist)
ustrdist DriverLastName  last,  gen(ldist)
keep if fdist<=2 & ldist<=2


//// STORE MATCHES ///
gen PriorPrison=1
collapse (max) PriorPrison, by(CitationNumber CountyNum)
qui compress
saveold "${temp}/TEMP-PRISON", replace




**************************************************
**************************************************
//// STEP D: INFO ON COUNTY OF RESIDENCE      ////
*************************************************
**************************************************


///// CROSSWALK BETWEEN FIPS AND UTC COUNTY CODES /////

import delim using "${raw}/geo/CENSUS_COUNTY_2010.csv", clear

gen fips=v2*1000+v3
keep if v1=="FL"

ren v4 NAME
replace NAME=trim(upper(NAME))
replace NAME=regexr(NAME," COUNTY","")
replace NAME=trim(upper(NAME))

keep NAME fips
tempfile X
save `X'

import excel using "${raw}/geo/UTC_CountyNum.xlsx", clear first

gen NAME=trim(upper(County))
replace NAME="MIAMI-DADE" if County=="DADE"

merge 1:1 NAME using `X', nogen
keep CountyNum fips
tempfile FIPSUTCXW
save `FIPSUTCXW'


///// CROSSWALK BETWEEN ZIPS AND COUNTIES /////

/// CENSUS COUNTIES ///
import delim using "${raw}/geo/CENSUS_COUNTY_2010.csv", clear

ren v1 state
ren v4 name
gen fips=v2*1000+v3
keep state name fips

tempfile LIST
save `LIST'


/// ZIP CODES //
import excel using "${raw}/geo/USPS_ZIP_COUNTY_2010.xlsx", clear first

/// BEST MATCH ///
gen double RATIO=RES_RATIO
bysort ZIP: egen double MAX=max(RATIO)

/// TEST THAT IT WORKS ///
gen MATCH=RATIO==MAX
bysort ZIP: egen ANYMATCH=max(MATCH)
tab ANYMATCH

/// KEEP ONLY MATCHES ///
keep if MATCH==1
drop RATIO MAX MATCH ANYMATCH 

/// DUPLICATES ///
gsort ZIP -BUS_RATIO -TOT_RATIO
duplicates drop ZIP, force


/// MERGE 1 ///
destring COUNTY, gen(fips)
merge m:1 fips using `LIST', keep(3) nogen
merge m:1 fips using `FIPSUTCXW', keep(1 3) nogen

/// CLEAN UP ///
drop *_RATIO
drop COUNTY
ren ZIP Zip
order Zip state fips name CountyNum 
ren CountyNum HomeCountyNum
destring Zip, replace force
qui compress
saveold "${temp}/TEMP-COUNTY", replace




******************************************************
///// COMPILE TOGETHER INTO MASTER DRIVER FILE //////
******************************************************

use "${temp}/process-citation", clear
gen Year = year(OffenseDate)
keep CitationNumber CountyNum Year DriverLast ZipCode

merge m:1 DriverLast using "${temp}/TEMP-HISPANIC", keep(1 3) nogen
replace CensusHisp = 0 if mi(CensusHisp)

tostring ZipCode, gen(tempZip)
replace tempZip=trim(tempZip)
gen Zip = substr(tempZip,1,5)
destring Zip, replace force

merge m:1 Zip Year using "${temp}/TEMP-IRS", keep(1 3) nogen
gen ZipIncomeMiss = mi(ZipIncome)
bysort Year: egen muZipIncome = mean(ZipIncome)
replace ZipIncome = muZipIncome if mi(ZipIncome)
drop Year muZipIncome
summ ZipIncome
replace ZipIncome = r(mean) if mi(ZipIncome)

merge 1:1 CitationNumber CountyNum using "${temp}/TEMP-PRISON", keep(1 3) nogen
replace PriorPrison = 0 if mi(PriorPrison)

keep CitationNumber CountyNum CensusHisp Zip ZipIncome ZipIncomeMiss PriorPrison

merge m:1 Zip using "${temp}/TEMP-COUNTY", keep(1 3) nogen
gen ZipFlorida = (state=="FL")
gen HomeFips = fips

keep  CitationNumber-PriorPrison ZipFlorida HomeFips HomeCountyNum
order CitationNumber-PriorPrison ZipFlorida HomeFips HomeCountyNum


qui compress
saveold "${temp}/step-2-driver", replace


rm "${temp}/TEMP-HISPANIC.dta"
rm "${temp}/TEMP-IRS.dta"
rm "${temp}/TEMP-PRISON.dta"
rm "${temp}/TEMP-COUNTY.dta"











